1 Principal Components

We will use the USArrests data set, which is part of base R.

# ISLR is attached for consistency with the textbook labs;
# the USArrests data itself ships with base R (datasets package)
library(ISLR)
states <- row.names(USArrests)  # the 50 US state names
names(USArrests)
## [1] "Murder"   "Assault"  "UrbanPop" "Rape"
# column means: the variables are on very different scales
apply(USArrests, 2, mean)
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
# column variances: Assault dominates by far, so the data
# should be standardized before PCA
apply(USArrests, 2, var) 
##     Murder    Assault   UrbanPop       Rape 
##   18.97047 6945.16571  209.51878   87.72916

There are several functions in R to conduct principal components analysis (prcomp(), princomp(), etc.). We standardize each variable using the option scale=TRUE.

# PCA on the standardized variables (scale=TRUE divides each
# variable by its standard deviation; centering is the default)
pr.out <- prcomp(USArrests, scale=TRUE)
names(pr.out) 
## [1] "sdev"     "rotation" "center"   "scale"    "x"

The center and scale components correspond to the means and standard deviations of the variables that were used for scaling prior to implementing PCA.

pr.out$center  # variable means used for centering
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
pr.out$scale   # variable standard deviations used for scaling
##    Murder   Assault  UrbanPop      Rape 
##  4.355510 83.337661 14.474763  9.366385

The rotation matrix provides the principal component loadings; each column of pr.out$rotation contains the corresponding principal component loading vector

# loading matrix: one column (loading vector) per principal component
pr.out$rotation
##                 PC1        PC2        PC3         PC4
## Murder   -0.5358995  0.4181809 -0.3412327  0.64922780
## Assault  -0.5831836  0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158  0.13387773
## Rape     -0.5434321 -0.1673186  0.8177779  0.08902432

Principal component score vectors:

# score matrix x: one row per state, one column per component
dim(pr.out$x)  
## [1] 50  4

We can plot the first two principal components as follows

biplot(pr.out, scale=0, cex=0.6)

The scale=0 argument to biplot() ensures that the arrows are scaled to represent the loadings; other values for scale give slightly different biplots with different interpretations.

Notice that this figure is a mirror image of Figure 10.1. Recall that the principal components are only unique up to a sign change, so we can reproduce Figure 10.1 by making a few small changes

# flip the signs of both the loadings and the scores (principal
# components are unique only up to sign, so the fit is unchanged)
pr.out$rotation <- -pr.out$rotation
pr.out$x <- -pr.out$x
biplot(pr.out, scale=0, cex=0.6)

Importance of principal components:

# standard deviation, proportion of variance explained (PVE),
# and cumulative PVE for each component
summary(pr.out)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.5749 0.9949 0.59713 0.41645
## Proportion of Variance 0.6201 0.2474 0.08914 0.04336
## Cumulative Proportion  0.6201 0.8675 0.95664 1.00000

We see that the first principal component explains 62.0% of the variance in the data, the next principal component explains 24.7% of the variance, and so forth.

# scree plot of the component variances
screeplot(pr.out, type="lines") 

pr.out$sdev  # standard deviations of the components
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
pr.var <- pr.out$sdev^2  # component variances
pr.var
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
pve <- pr.var/sum(pr.var)  # proportion of variance explained
pve
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')

plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')

1.1 Example: Body Data

This data set contains a set of body measurements along with age, height, weight and gender on 507 individuals (247 men and 260 women). We expect some of these variables to be highly correlated.

# body data from 
# Journal of Statistics Education Volume 11, Number 2 (2003),
# www.amstat.org/publications/jse/v11n2/datasets.heinz.html

load("../Data/body.RData") 
# keep only the 23 numeric body measurements: drop age, gender,
# and the gender dummy so PCA is based on measurements alone
bodydata <- subset(body, select = -c(age, gender, gender_dummy)) 
str(bodydata)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 507 obs. of  23 variables:
##  $ biacromial_diameter    : num  42.9 43.7 40.1 44.3 42.5 43.3 43.5 44.4 43.5 42 ...
##  $ biiliac_diameter       : num  26 28.5 28.2 29.9 29.9 27 30 29.8 26.5 28 ...
##  $ bitrochanteric_diameter: num  31.5 33.5 33.3 34 34 31.5 34 33.2 32.1 34 ...
##  $ chest_depth            : num  17.7 16.9 20.9 18.4 21.5 19.6 21.9 21.8 15.5 22.5 ...
##  $ chest_diameter         : num  28 30.8 31.7 28.2 29.4 31.3 31.7 28.8 27.5 28 ...
##  $ elbow_diameter         : num  13.1 14 13.9 13.9 15.2 14 16.1 15.1 14.1 15.6 ...
##  $ wrist_diameter         : num  10.4 11.8 10.9 11.2 11.6 11.5 12.5 11.9 11.2 12 ...
##  $ knee_diameter          : num  18.8 20.6 19.7 20.9 20.7 18.8 20.8 21 18.9 21.1 ...
##  $ ankle_diameter         : num  14.1 15.1 14.1 15 14.9 13.9 15.6 14.6 13.2 15 ...
##  $ shoulder_girth         : num  106 110 115 104 108 ...
##  $ chest_girth            : num  89.5 97 97.5 97 97.5 ...
##  $ waist_girth            : num  71.5 79 83.2 77.8 80 82.5 82 76.8 68.5 77.5 ...
##  $ abdominal_girth        : num  74.5 86.5 82.9 78.8 82.5 80.1 84 80.5 69 81.5 ...
##  $ hip_girth              : num  93.5 94.8 95 94 98.5 95.3 101 98 89.5 99.8 ...
##  $ thigh_girth            : num  51.5 51.5 57.3 53 55.4 57.5 60.9 56 50 59.8 ...
##  $ bicep_girth            : num  32.5 34.4 33.4 31 32 33 42.4 34.1 33 36.5 ...
##  $ forearm_girth          : num  26 28 28.8 26.2 28.4 28 32.3 28 26 29.2 ...
##  $ knee_girth             : num  34.5 36.5 37 37 37.7 36.6 40.1 39.2 35.5 38.3 ...
##  $ calf_girth             : num  36.5 37.5 37.3 34.8 38.6 36.1 40.3 36.7 35 38.6 ...
##  $ ankle_girth            : num  23.5 24.5 21.9 23 24.4 23.5 23.6 22.5 22 22.2 ...
##  $ wrist_girth            : num  16.5 17 16.9 16.6 18 16.9 18.8 18 16.5 16.9 ...
##  $ weight                 : num  65.6 71.8 80.7 72.6 78.8 74.8 86.4 78.4 62 81.6 ...
##  $ height                 : num  174 175 194 186 187 ...

Inspecting the correlation matrix we see that these features are in fact highly correlated.

library(ggcorrplot)
cormat <- round(cor(bodydata), 2)  # pairwise correlations, 2 decimals
# hc.order reorders variables by hierarchical clustering;
# type="lower" draws only the lower triangle
ggcorrplot(cormat, hc.order = TRUE, type = "lower", outline.color = "white")

Find the principal components:

# PCA on the standardized body measurements
pr.out <- prcomp(bodydata, scale=TRUE)
summary(pr.out)  
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.8644 1.5758 1.03211 0.95475 0.69342 0.65851 0.58162
## Proportion of Variance 0.6493 0.1080 0.04632 0.03963 0.02091 0.01885 0.01471
## Cumulative Proportion  0.6493 0.7572 0.80357 0.84320 0.86410 0.88296 0.89767
##                           PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.5674 0.52727 0.52186 0.49794 0.45082 0.42002 0.39670
## Proportion of Variance 0.0140 0.01209 0.01184 0.01078 0.00884 0.00767 0.00684
## Cumulative Proportion  0.9117 0.92375 0.93559 0.94637 0.95521 0.96288 0.96972
##                           PC15   PC16    PC17    PC18   PC19    PC20    PC21
## Standard deviation     0.38094 0.3490 0.32002 0.29400 0.2835 0.23619 0.21552
## Proportion of Variance 0.00631 0.0053 0.00445 0.00376 0.0035 0.00243 0.00202
## Cumulative Proportion  0.97603 0.9813 0.98578 0.98954 0.9930 0.99546 0.99748
##                           PC22   PC23
## Standard deviation     0.19313 0.1437
## Proportion of Variance 0.00162 0.0009
## Cumulative Proportion  0.99910 1.0000
# change the signs of factor loadings
# (components are unique only up to sign; flipping both the
# loadings and the scores leaves the fit unchanged)
pr.out$rotation <- -pr.out$rotation
pr.out$x <- -pr.out$x
biplot(pr.out, scale=0, cex=0.6)

pr.var <- pr.out$sdev^2   # component variances
pve <- pr.var/sum(pr.var) # proportion of variance explained
pve
##  [1] 0.6492846086 0.1079652790 0.0463153491 0.0396329172 0.0209057316
##  [6] 0.0188539644 0.0147079693 0.0139999780 0.0120875732 0.0118406532
## [11] 0.0107802578 0.0088363210 0.0076704395 0.0068423027 0.0063093336
## [16] 0.0052965553 0.0044528583 0.0037581598 0.0034951865 0.0024254119
## [21] 0.0020194249 0.0016217326 0.0008979928
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')

plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')

Approximately 86% of the total variation can be explained by the first 5 principal components.

1.2 Example: Regional Happiness Indicators

# load the data set
load("../Data/lifedata2015.RData")
# exclude the first column and send it to prcomp() function 
pr_happiness <- prcomp(lifedata2015[,-1], scale=TRUE)
summary(pr_happiness)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.6487 2.7994 1.61778 1.44839 1.31128 1.23677 1.08694
## Proportion of Variance 0.3247 0.1911 0.06383 0.05117 0.04194 0.03731 0.02882
## Cumulative Proportion  0.3247 0.5158 0.57968 0.63084 0.67278 0.71009 0.73890
##                            PC8     PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     1.05930 0.99654 0.94201 0.9034 0.84682 0.78979 0.76944
## Proportion of Variance 0.02737 0.02422 0.02164 0.0199 0.01749 0.01521 0.01444
## Cumulative Proportion  0.76627 0.79049 0.81214 0.8320 0.84953 0.86474 0.87918
##                           PC15   PC16   PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.76204 0.7189 0.6500 0.63497 0.59349 0.55320 0.51029
## Proportion of Variance 0.01416 0.0126 0.0103 0.00983 0.00859 0.00746 0.00635
## Cumulative Proportion  0.89335 0.9060 0.9163 0.92609 0.93468 0.94215 0.94850
##                          PC22    PC23    PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.4958 0.48471 0.45005 0.43610 0.40353 0.39223 0.36695
## Proportion of Variance 0.0060 0.00573 0.00494 0.00464 0.00397 0.00375 0.00328
## Cumulative Proportion  0.9545 0.96022 0.96516 0.96980 0.97377 0.97752 0.98081
##                           PC29    PC30    PC31   PC32    PC33    PC34    PC35
## Standard deviation     0.33688 0.33084 0.30826 0.2862 0.27626 0.26303 0.24261
## Proportion of Variance 0.00277 0.00267 0.00232 0.0020 0.00186 0.00169 0.00144
## Cumulative Proportion  0.98358 0.98625 0.98856 0.9906 0.99242 0.99411 0.99555
##                           PC36   PC37    PC38    PC39    PC40    PC41
## Standard deviation     0.22405 0.1925 0.17796 0.16091 0.15358 0.11901
## Proportion of Variance 0.00122 0.0009 0.00077 0.00063 0.00058 0.00035
## Cumulative Proportion  0.99677 0.9977 0.99845 0.99908 0.99965 1.00000
# change the signs of factor loadings
# and plot the biplot
pr_happiness$rotation <- -pr_happiness$rotation
pr_happiness$x <- -pr_happiness$x
# biplot
biplot(pr_happiness, scale=0, cex=0.6)

# compute proportion of variance explained
pr.var <- pr_happiness$sdev^2 
pve <- pr.var/sum(pr.var)
pve
##  [1] 0.3247032515 0.1911377271 0.0638347339 0.0511669356 0.0419376566
##  [6] 0.0373075532 0.0288154928 0.0273685063 0.0242215157 0.0216434975
## [11] 0.0199036321 0.0174903839 0.0152139590 0.0144399072 0.0141635465
## [16] 0.0126038973 0.0103042710 0.0098339512 0.0085909019 0.0074641692
## [21] 0.0063510649 0.0059953400 0.0057303036 0.0049400620 0.0046386738
## [26] 0.0039715287 0.0037522269 0.0032841890 0.0027679669 0.0026696605
## [31] 0.0023177290 0.0019980467 0.0018614512 0.0016873924 0.0014356089
## [36] 0.0012243745 0.0009042744 0.0007723912 0.0006314968 0.0005752529
## [41] 0.0003454763
# plot PVE
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')

# cumulative proportion of variance explained
plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')

Approximately 80% of the variation in the data can be explained by the first 10 principal components.

An alternative package:

library(factoextra)
# PCA using factomineR
library(FactoMineR)
res_pca_lifedata <- PCA(lifedata2015[,-1],  graph = FALSE)
get_eig(res_pca_lifedata)
##         eigenvalue variance.percent cumulative.variance.percent
## Dim.1  13.31283331      32.47032515                    32.47033
## Dim.2   7.83664681      19.11377271                    51.58410
## Dim.3   2.61722409       6.38347339                    57.96757
## Dim.4   2.09784436       5.11669356                    63.08426
## Dim.5   1.71944392       4.19376566                    67.27803
## Dim.6   1.52960968       3.73075532                    71.00879
## Dim.7   1.18143520       2.88154928                    73.89034
## Dim.8   1.12210876       2.73685063                    76.62719
## Dim.9   0.99308215       2.42215157                    79.04934
## Dim.10  0.88738340       2.16434975                    81.21369
## Dim.11  0.81604892       1.99036321                    83.20405
## Dim.12  0.71710574       1.74903839                    84.95309
## Dim.13  0.62377232       1.52139590                    86.47448
## Dim.14  0.59203619       1.44399072                    87.91848
## Dim.15  0.58070541       1.41635465                    89.33483
## Dim.16  0.51675979       1.26038973                    90.59522
## Dim.17  0.42247511       1.03042710                    91.62565
## Dim.18  0.40319200       0.98339512                    92.60904
## Dim.19  0.35222698       0.85909019                    93.46813
## Dim.20  0.30603094       0.74641692                    94.21455
## Dim.21  0.26039366       0.63510649                    94.84966
## Dim.22  0.24580894       0.59953400                    95.44919
## Dim.23  0.23494245       0.57303036                    96.02222
## Dim.24  0.20254254       0.49400620                    96.51623
## Dim.25  0.19018562       0.46386738                    96.98009
## Dim.26  0.16283268       0.39715287                    97.37725
## Dim.27  0.15384130       0.37522269                    97.75247
## Dim.28  0.13465175       0.32841890                    98.08089
## Dim.29  0.11348664       0.27679669                    98.35768
## Dim.30  0.10945608       0.26696605                    98.62465
## Dim.31  0.09502689       0.23177290                    98.85642
## Dim.32  0.08191992       0.19980467                    99.05623
## Dim.33  0.07631950       0.18614512                    99.24237
## Dim.34  0.06918309       0.16873924                    99.41111
## Dim.35  0.05885996       0.14356089                    99.55467
## Dim.36  0.05019935       0.12243745                    99.67711
## Dim.37  0.03707525       0.09042744                    99.76754
## Dim.38  0.03166804       0.07723912                    99.84478
## Dim.39  0.02589137       0.06314968                    99.90793
## Dim.40  0.02358537       0.05752529                    99.96545
## Dim.41  0.01416453       0.03454763                   100.00000
# Scree plot
fviz_screeplot(res_pca_lifedata, addlabels = TRUE)

1.3 Principal Components Regression (PCR)

Let’s apply PCR approach to the regional happiness data. We want to predict provincial happiness levels using the first few principal components.

We can use pcr() function in the {pls} package for that purpose.

library(pls)
set.seed(123)  # reproducibility

# principal components regression of happiness on all predictors
# except the province identifier; predictors are standardized
pcr_fit <- pcr(happiness_level ~ . -province, 
               data = lifedata2015, 
               scale=TRUE, 
               validation="LOO") # use leave-one-out cross-validation
summary(pcr_fit)
## Data:    X dimension: 81 40 
##  Y dimension: 81 1
## Fit method: svdpc
## Number of components considered: 40
## 
## VALIDATION: RMSEP
## Cross-validated using 81 leave-one-out segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            7.58    7.590    6.696    6.850    6.363    6.042    5.698
## adjCV         7.58    7.589    6.694    6.849    6.361    6.013    5.691
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       5.744    5.565    5.438     5.108     5.056     5.191     5.114
## adjCV    5.744    5.557    5.431     5.090     5.047     5.185     5.102
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## CV        5.148     5.425     5.550     5.693     5.341     4.828     4.835
## adjCV     5.137     5.420     5.542     5.691     5.286     4.815     4.825
##        21 comps  22 comps  23 comps  24 comps  25 comps  26 comps  27 comps
## CV        4.857     4.903     4.981     5.068     5.231     5.398     5.404
## adjCV     4.847     4.897     4.970     5.058     5.220     5.387     5.392
##        28 comps  29 comps  30 comps  31 comps  32 comps  33 comps  34 comps
## CV        5.443     5.588     5.636     5.613     5.804     5.994     6.185
## adjCV     5.429     5.576     5.620     5.598     5.788     5.977     6.167
##        35 comps  36 comps  37 comps  38 comps  39 comps  40 comps
## CV        6.496     6.618     6.851     6.501     6.848     7.309
## adjCV     6.476     6.597     6.837     6.473     6.824     7.282
## 
## TRAINING: % variance explained
##                  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X                 33.217    52.13    58.61    63.06    67.28    71.10    73.98
## happiness_level    2.339    26.62    27.94    44.79    55.72    56.07    57.85
##                  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## X                  76.73    79.16     81.31     83.35     85.13     86.65
## happiness_level    60.82    63.14     66.85     67.15     67.50     68.60
##                  14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## X                   88.12     89.57     90.80     91.83     92.74     93.62
## happiness_level     69.02     69.44     71.35     71.63     77.56     77.60
##                  20 comps  21 comps  22 comps  23 comps  24 comps  25 comps
## X                   94.39     95.04     95.65     96.23     96.73     97.20
## happiness_level     77.63     77.71     77.72     78.16     78.24     78.46
##                  26 comps  27 comps  28 comps  29 comps  30 comps  31 comps
## X                   97.59     97.93     98.26     98.54     98.78     99.02
## happiness_level     78.53     79.28     79.89     79.91     80.54     80.80
##                  32 comps  33 comps  34 comps  35 comps  36 comps  37 comps
## X                   99.21     99.38     99.53     99.66     99.75     99.83
## happiness_level     80.81     80.84     80.93     81.07     81.07     81.12
##                  38 comps  39 comps  40 comps
## X                   99.90     99.96    100.00
## happiness_level     83.24     83.38     83.65
# how many principal components should we use? 
# RMSE as a function of the number of PC as computed using CV
validationplot(pcr_fit)

The minimum RMSE is achieved when we use 19 components.

# refit PCR keeping only the first 19 components,
# the number that minimized the cross-validated RMSE above
pcr_fit <- pcr(happiness_level ~ . -province, 
               data = lifedata2015, 
               scale = TRUE, 
               ncomp = 19)
summary(pcr_fit)
## Data:    X dimension: 81 40 
##  Y dimension: 81 1
## Fit method: svdpc
## Number of components considered: 19
## TRAINING: % variance explained
##                  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X                 33.217    52.13    58.61    63.06    67.28    71.10    73.98
## happiness_level    2.339    26.62    27.94    44.79    55.72    56.07    57.85
##                  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## X                  76.73    79.16     81.31     83.35     85.13     86.65
## happiness_level    60.82    63.14     66.85     67.15     67.50     68.60
##                  14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## X                   88.12     89.57     90.80     91.83     92.74     93.62
## happiness_level     69.02     69.44     71.35     71.63     77.56     77.60

2 K-Means Clustering

The function kmeans() performs K-means clustering in R. We begin with a simple simulated example in which there truly are two clusters in the data: the first 25 observations have a mean shift relative to the next 25 observations.

# create a simulated data set
set.seed(2)
x <- matrix(rnorm(50*2), ncol=2)
x[1:25,1] <- x[1:25,1]+3
x[1:25,2] <- x[1:25,2]-4
# perform K-means clustering with 2 centers and 20 random starts
km.out <- kmeans(x, centers = 2, nstart = 20)
# print cluster information 
km.out$cluster
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2
# plot clusters
plot(x, col = (km.out$cluster+1), 
     main = "K-Means Clustering Results with K=2", 
     xlab = "", 
     ylab = "", 
     pch = 20, 
     cex = 2) 

In this example, we knew that there really were two clusters because we generated the data. However, for real data, in general we do not know the true number of clusters. We could instead have performed K-means clustering on this example with K = 3.

set.seed(4)
km.out <- kmeans(x, centers = 3, nstart = 20)
km.out
## K-means clustering with 3 clusters of sizes 17, 23, 10
## 
## Cluster means:
##         [,1]        [,2]
## 1  3.7789567 -4.56200798
## 2 -0.3820397 -0.08740753
## 3  2.3001545 -2.69622023
## 
## Clustering vector:
##  [1] 1 3 1 3 1 1 1 3 1 3 1 3 1 3 1 3 1 1 1 1 1 3 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 3 2 3 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 25.74089 52.67700 19.56137
##  (between_SS / total_SS =  79.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# plot the results with k=3
plot(x, col = (km.out$cluster+1), 
     main = "K-Means Clustering Results with K=3", 
     xlab = "", 
     ylab = "", 
     pch = 20, 
     cex = 2)

When K = 3, K-means clustering splits up the two clusters. It is strongly recommended to run K-means clustering with a large value of nstart, such as 20 or 50, since otherwise an undesirable local optimum may be obtained.

# compute and print within sum of squares 
set.seed(3)
km.out <- kmeans(x,3,nstart=1)
km.out$tot.withinss
## [1] 97.97927
km.out <- kmeans(x,3,nstart=20)
km.out$tot.withinss
## [1] 97.97927

2.1 Example: Bodydata

# the data set was loaded previously
# NOTE(review): bodydata is not scaled here, so the clustering is
# driven mainly by the variables with the largest variances
km.out <- kmeans(bodydata, centers = 2, nstart = 20)
plot(bodydata[,1:3], col=(km.out$cluster), cex=2)

Plot clustering based on weight and height and compare to the actual gender factor:

# Large blank circles are the resulting clusters
# columns 22:23 of bodydata are weight and height
plot(bodydata[,22:23],col=(km.out$cluster),cex=2)
# put dots inside circles representing observed gender
# red = men, black = women
# NOTE(review): assumes weight/height occupy columns 23:24 of the
# full body data frame -- confirm against names(body)
points(body[,23:24], col=c(1,2)[body$gender], pch=19, cex=1)

# agreement rate between cluster labels and observed gender
observed_class <- c(1,2)[body$gender]
km_cluster <- km.out$cluster
ratio <- sum(observed_class == km_cluster)/nrow(bodydata)
ratio
## [1] 0.8402367

84% of observations seem to be clustered correctly. Note that if we know the clusters we do not need to apply K-Means clustering method. This was just for demonstration purposes. When we know the labels (like gender in the bodydata) there are better classification methods which we’ve discussed in our previous classes.

2.2 Example: Province-level well-being data

In this example we will use Turkish province data set and its subset.

# load packages 
library(tidyverse)  # tidy data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization 
# load data 
load("../Data/lifedata2015.RData") 
# make province names rownames
lifedata2015 <- column_to_rownames(lifedata2015, var = "province")
# use only happiness level and some selected variables 
happiness <- lifedata2015 %>% 
  select(happiness_level, life_exp, avr_daily_earnings, sat_rate_with_health_status)
head(happiness)
##                happiness_level life_exp avr_daily_earnings
## Adana                    53.00 77.39335           59.06489
## Adiyaman                 65.01 79.54820           53.24281
## Afyonkarahisar           76.43 76.99212           53.91157
## Agri                     60.09 75.62830           56.10804
## Amasya                   66.02 77.76714           53.77365
## Ankara                   56.23 79.37352           70.14222
##                sat_rate_with_health_status
## Adana                                68.47
## Adiyaman                             69.13
## Afyonkarahisar                       80.07
## Agri                                 66.20
## Amasya                               74.16
## Ankara                               71.76

Display the correlation matrix of the happiness data:

library(ggcorrplot)
cormat <- round(cor(happiness), 2)
ggcorrplot(cormat, hc.order = TRUE, type = "lower", outline.color = "white")

# visualize the distance function 
# blue: more similar; red: dissimilar 
# (look at the color legend, low value means more similar)
distance <- dist(scale(happiness))
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# display few elements of distance matrix
as.matrix(distance)[1:6, 1:6]
##                   Adana Adiyaman Afyonkarahisar     Agri   Amasya   Ankara
## Adana          0.000000 2.768821       4.151958 2.060437 2.324195 2.686825
## Adiyaman       2.768821 0.000000       3.798892 3.916284 2.062164 2.889968
## Afyonkarahisar 4.151958 3.798892       0.000000 4.033355 2.057593 4.696319
## Agri           2.060437 3.916284       4.033355 0.000000 2.863795 4.407828
## Amasya         2.324195 2.062164       2.057593 2.863795 0.000000 3.253576
## Ankara         2.686825 2.889968       4.696319 4.407828 3.253576 0.000000

Apply K-means clustering:

# find cluster 
happiness_scaled <- scale(happiness)
kmeans_happy <- kmeans(happiness_scaled, centers = 2, nstart=20)
# bivariate scatter plots
plot(happiness[,1:4],col=(kmeans_happy$cluster),cex=2)

# cluster plot using principal components
# use factoextra::fviz_cluster() function to visualize clusters
# along with the first two principal components
fviz_cluster(kmeans_happy, data = happiness)

# cluster plot using principal components
# without ellipses
fviz_cluster(kmeans_happy, geom = "point", ellipse = FALSE, data = happiness) +
  ggtitle("K-means cluster of provinces with K=2")

# PCA using factomineR
library(FactoMineR)
res.pca <- PCA(happiness,  graph = FALSE)
get_eig(res.pca)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  1.6875357        42.188394                    42.18839
## Dim.2  1.0669411        26.673528                    68.86192
## Dim.3  0.9111692        22.779229                    91.64115
## Dim.4  0.3343540         8.358849                   100.00000
# Scree plot
fviz_screeplot(res.pca, addlabels = TRUE, ylim = c(0, 50))

# cluster plot using original data 
happiness %>%
  mutate(cluster  = kmeans_happy$cluster,
         province = row.names(happiness)) %>%
  ggplot(aes(happiness_level, avr_daily_earnings, color = factor(cluster), label = province)) +
  geom_text()

# cluster plot using original data 
happiness %>%
  mutate(cluster  = kmeans_happy$cluster,
         province = row.names(happiness)) %>%
  ggplot(aes(happiness_level, life_exp, color = factor(cluster), label = province)) +
  geom_text()

# cluster plot using original data 
happiness %>%
  mutate(cluster  = kmeans_happy$cluster,
         province = row.names(happiness)) %>%
  ggplot(aes(happiness_level, sat_rate_with_health_status, color = factor(cluster), label = province)) +
  geom_text()

# compare and contrast different number of clusters
# plot different number of clusters 
k2 <- kmeans(happiness, centers = 2, nstart = 25)
k3 <- kmeans(happiness, centers = 3, nstart = 25)
k4 <- kmeans(happiness, centers = 4, nstart = 25)
k5 <- kmeans(happiness, centers = 5, nstart = 25)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = happiness) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point",  data = happiness) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point",  data = happiness) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point",  data = happiness) + ggtitle("k = 5")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)

# determine the optimal number of clusters 
set.seed(123)
# look for the elbow
fviz_nbclust(happiness, kmeans, method = "wss")

# for optimal k=3 compute summary statistics 
happiness %>%
  mutate(Cluster = k3$cluster) %>%
  group_by(Cluster) %>%
  summarise_all("mean")
## # A tibble: 3 x 5
##   Cluster happiness_level life_exp avr_daily_earnings sat_rate_with_health_stat~
##     <int>           <dbl>    <dbl>              <dbl>                      <dbl>
## 1       1            55.9     78.1               55.4                       69.3
## 2       2            69.1     78.2               55.8                       75.0
## 3       3            59.2     78.1               68.6                       73.6
# visualize K-means results with 4 clusters
fviz_cluster(k4, data = happiness,
             palette = c("#2E9FDF", "#28B463", "#E7B800", "#FC4E07"),
             ellipse.type = "euclid", # Concentration ellipse
             star.plot = TRUE,        # Add segments from centroids to items
             repel = TRUE,            # Avoid label overplotting 
             ggtheme = theme_minimal()
)

# summary results for k = 4 clusters 
k4$tot.withinss # total sum of sq.
## [1] 3438.896
k4$centers      # centers (means) of clusters
##   happiness_level life_exp avr_daily_earnings sat_rate_with_health_status
## 1        70.49913 77.95681           56.46359                    75.40783
## 2        50.89154 78.75987           56.77239                    66.14000
## 3        59.40156 78.02649           54.49261                    71.27375
## 4        59.18923 78.09380           68.62644                    73.59692
k4$size         # number of obs. in clusters
## [1] 23 13 32 13
# for k=4 add cluster results to the data set 
# and compute summary statistics 
happiness_cl <- happiness %>%
  mutate(cluster_kmeans = k4$cluster) 

happiness_cl %>%
  group_by(cluster_kmeans) %>%
  summarise_all("mean")
## # A tibble: 4 x 5
##   cluster_kmeans happiness_level life_exp avr_daily_earnin~ sat_rate_with_healt~
##            <int>           <dbl>    <dbl>             <dbl>                <dbl>
## 1              1            70.5     78.0              56.5                 75.4
## 2              2            50.9     78.8              56.8                 66.1
## 3              3            59.4     78.0              54.5                 71.3
## 4              4            59.2     78.1              68.6                 73.6

3 Hierarchical Clustering

The hclust() function implements hierarchical clustering in R.

# use previously created simulated data x (see kmeans clustering)
# clustering with different linkage functions 
hc.complete <- hclust(dist(x), method="complete")
hc.average <- hclust(dist(x), method="average")
hc.single <- hclust(dist(x), method="single")
hc.ward2 <- hclust(dist(x), method="ward.D2")

The option method="ward.D2" performs Ward's minimum variance method. At each step it merges the pair of clusters whose merger yields the smallest increase in the total within-cluster variance. (This option corresponds to cluster::agnes(..., method = "ward") in the {cluster} package.)

Visualize them using base R plot function:

# plot them side-by-side using base R plot function 
# cex = label size
labsize = 0.8 
par(mfrow=c(2,2))
plot(hc.complete,main="Complete Linkage", xlab="", sub="", cex=labsize)
plot(hc.average, main="Average Linkage", xlab="", sub="", cex=labsize)
plot(hc.single, main="Single Linkage", xlab="", sub="", cex=labsize)
plot(hc.ward2, main="Ward Linkage", xlab="", sub="", cex=labsize)

# cut dendrogram into groups of data 
# k = 2 groups
cutree(hc.complete, k= 2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2
cutree(hc.average, 2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2
## [39] 2 2 2 2 2 1 2 1 2 2 2 2
cutree(hc.single, 2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1
# 4 clusters using single linkage
cutree(hc.single, 4)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3
## [39] 3 3 3 4 3 3 3 3 3 3 3 3

To scale the variables before performing hierarchical clustering of the observations, we use the scale() function:

# standardize the simulated features, then cluster with complete linkage
xsc <- scale(x)
plot(hclust(dist(xsc), method="complete"), main="Hierarchical Clustering with Scaled Features")

3.1 Example: Province-level Well-being data

# load packages
library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization
# load data
load("../Data/lifedata2015.RData")
# make province names rownames
lifedata2015 <- column_to_rownames(lifedata2015, var = "province")
# hierarchical clustering using stats::hclust
# scale variables first, then compute the distance matrix once and
# reuse it for all four linkage methods (dist() is O(n^2), so this
# avoids recomputing the same dissimilarities four times)
# can also use pipes, %>%, see example at the end
d_life <- dist(scale(lifedata2015))
hc_complete <- hclust(d_life, method = "complete")
hc_average <- hclust(d_life, method = "average")
hc_single <- hclust(d_life, method = "single")
hc_ward <- hclust(d_life, method = "ward.D2")
labsize <- 0.7 # dendrogram label size (province names)
plot(hc_complete, main = "Complete Linkage", xlab = "", sub = "", cex = labsize)

plot(hc_average, main = "Average Linkage", xlab = "", sub = "", cex = labsize)

plot(hc_single, main = "Single Linkage", xlab = "", sub = "", cex = labsize)

plot(hc_ward, main = "Ward's Method", xlab = "", sub = "", cex = labsize)

# focus on a subset of variables
# use only happiness level and some selected variables
happiness <- lifedata2015 %>% 
  select(happiness_level, life_exp, avr_daily_earnings, sat_rate_with_health_status)
# cluster using hclust: scale the variables, then compute the distance
# matrix once and reuse it for each linkage method instead of calling
# dist(scale(...)) four times
d_happy <- dist(scale(happiness))
hc_complete <- hclust(d_happy, method = "complete")
hc_average <- hclust(d_happy, method = "average")
hc_single <- hclust(d_happy, method = "single")
hc_ward <- hclust(d_happy, method = "ward.D2")
plot(hc_complete, main = "Complete Linkage", xlab = "", sub = "", cex = labsize)

plot(hc_average, main = "Average Linkage", xlab = "", sub = "", cex = labsize)

plot(hc_ward, main = "Ward's method", xlab = "", sub = "", cex = labsize)

# form clusters from the complete-linkage dendrogram
# ncluster = number of groups to cut the tree into
ncluster = 2
# cutree() returns a named integer vector of cluster labels (1 or 2),
# one per province
cutree(hc_complete, ncluster)
##          Adana       Adiyaman Afyonkarahisar           Agri         Amasya 
##              1              1              2              2              1 
##         Ankara        Antalya         Artvin          Aydin      Balikesir 
##              1              1              1              1              2 
##        Bilecik         Bingöl         Bitlis           Bolu         Burdur 
##              1              1              1              2              1 
##          Bursa      Çanakkale        Çankiri          Çorum        Denizli 
##              1              1              2              1              1 
##     Diyarbakir         Edirne         Elazig       Erzincan        Erzurum 
##              1              1              1              1              1 
##      Eskisehir      Gaziantep        Giresun      Gümüshane        Hakkari 
##              1              2              1              1              2 
##          Hatay        Isparta         Mersin       Istanbul          Izmir 
##              1              2              1              1              1 
##           Kars      Kastamonu        Kayseri     Kirklareli       Kirsehir 
##              1              1              1              1              1 
##        Kocaeli          Konya        Kütahya        Malatya         Manisa 
##              1              2              2              1              1 
##  Kahramanmaras         Mardin          Mugla            Mus       Nevsehir 
##              1              1              1              1              1 
##          Nigde           Ordu           Rize        Sakarya         Samsun 
##              2              1              2              2              1 
##          Siirt          Sinop          Sivas       Tekirdag          Tokat 
##              1              2              1              1              1 
##        Trabzon        Tunceli      Sanliurfa           Usak            Van 
##              1              1              1              2              2 
##         Yozgat      Zonguldak        Aksaray        Bayburt        Karaman 
##              1              1              1              2              2 
##      Kirikkale         Batman         Sirnak         Bartin        Ardahan 
##              2              1              2              1              2 
##          Igdir         Yalova        Karabük          Kilis       Osmaniye 
##              1              2              1              2              1 
##          Düzce 
##              2
# plot dendrogram
plot(hc_complete)
# draw rectangles around the ncluster groups on the open dendrogram
# (border = 2:6 cycles through base plotting colours 2..6)
rect.hclust(hc_complete , k = ncluster, border = 2:6) 

# colorize branches using dendextend library
library(dendextend)
# convert the hclust result to a dendrogram object first
avg_dend_obj <- as.dendrogram(hc_complete)
# colour the branches below cut height h = 3
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)

# Cut the complete-linkage tree into k = 2 groups
hc_cluster2 <- cutree(hc_complete, k = 2)

# Number of members in each cluster
table(hc_cluster2)
## hc_cluster2
##  1  2 
## 57 24
# visualize result in a scatter plot using factoextra package 
# 2 clusters
library(factoextra) 
fviz_cluster(list(data = happiness, cluster = hc_cluster2))

# we can add cluster information into data
# (note: the result below is only printed, not assigned back)
happiness %>%
  mutate(cluster = hc_cluster2) %>%
  head
##                happiness_level life_exp avr_daily_earnings
## Adana                    53.00 77.39335           59.06489
## Adiyaman                 65.01 79.54820           53.24281
## Afyonkarahisar           76.43 76.99212           53.91157
## Agri                     60.09 75.62830           56.10804
## Amasya                   66.02 77.76714           53.77365
## Ankara                   56.23 79.37352           70.14222
##                sat_rate_with_health_status cluster
## Adana                                68.47       1
## Adiyaman                             69.13       1
## Afyonkarahisar                       80.07       2
## Agri                                 66.20       2
## Amasya                               74.16       1
## Ankara                               71.76       1
# Cut tree into 3 groups
# use Ward method results
hc_cluster3 <- cutree(hc_ward, k = 3)

# Number of members in each cluster
table(hc_cluster3)
## hc_cluster3
##  1  2  3 
## 31 38 12
# visualize dendrogram 
# use factoextra::hcut() function, which clusters and cuts in one step;
# stand = TRUE standardizes the variables before clustering
hres3 <- hcut(happiness, k = 3, stand = TRUE)
# Visualize with one colour per cluster
fviz_dend(hres3, rect = TRUE, cex = 0.5,
          k_colors = c("#F1C40F","#28B463", "#E67E22"))

# for html color codes visit https://htmlcolorcodes.com/
# visualize the 3-cluster result in a scatter plot (factoextra)
library(factoextra)
# name the data/label pair before plotting instead of nesting the call
cluster3_res <- list(data = happiness, cluster = hc_cluster3)
fviz_cluster(cluster3_res,
             repel = TRUE, # avoid overplotting text labels
             palette = c("#F1C40F", "#E67E22", "#28B463"))

# visualize dendrogram 
# use factoextra::hcut() function 
# cut into 4 clusters
hres4 <- hcut(happiness, k = 4, stand = TRUE) # default is hc_method = "ward.D2"
# Visualize: colour branches and labels by cluster, and frame each
# cluster with a filled rectangle
fviz_dend(hres4, cex = 0.5,
          k_colors = c("#F1C40F", "#28B463", "#E67E22", "#3498DB"), 
          color_labels_by_k = TRUE,
          rect = TRUE,
          rect_border = c("#F1C40F","#28B463", "#E67E22", "#3498DB"), 
          rect_fill = TRUE
         )

# horizontal dendrogram: add the horiz = TRUE option
# (same fit and options as above, only the orientation changes)
fviz_dend(hres4,
          horiz = TRUE,
          cex = 0.5,
          rect = TRUE,
          rect_fill = TRUE,
          rect_border = c("#F1C40F", "#28B463", "#E67E22", "#3498DB"),
          k_colors = c("#F1C40F", "#28B463", "#E67E22", "#3498DB"),
          color_labels_by_k = TRUE)

# circular dendrogram: add the type = "circular" option
fviz_dend(hres4,
          type = "circular",
          cex = 0.5,
          k_colors = c("#F1C40F", "#28B463", "#E67E22", "#3498DB"))

# Alternatively the dendrogram may be cut inside fviz_dend itself
fviz_dend(hc_ward,      # cluster results (an hclust object)
          k = 4,        # desired number of clusters
          rect = TRUE,  # draw rectangles around clusters
          cex = 0.5     # label size (province names)
          )
# 4 clusters using Ward's method 
# Cut tree into 4 groups
hc_cluster_ward4 <- cutree(hc_ward, k = 4)
library(factoextra) 
fviz_cluster(list(data = happiness, cluster = hc_cluster_ward4),
          palette = c("#F1C40F", "#3498DB", "#E67E22", "#28B463"), 
          repel = TRUE)

# add hierarchical clustering result to the data set
# NOTE(review): happiness_cl and its cluster_kmeans column are not
# created anywhere in this section — presumably they come from the
# earlier k-means clustering section; confirm that it was run first,
# otherwise this chunk errors on the missing object
happiness_cl <- happiness_cl %>% 
  mutate(cluster_hc = hres4$cluster)
# cluster means: drop the k-means labels, then average every remaining
# variable within each hierarchical cluster
happiness_cl %>% 
  select(-cluster_kmeans) %>% 
  group_by(cluster_hc) %>% 
  summarize_all(mean)
## # A tibble: 4 x 5
##   cluster_hc happiness_level life_exp avr_daily_earnings sat_rate_with_health_s~
##        <int>           <dbl>    <dbl>              <dbl>                   <dbl>
## 1          1            56.0     77.8               55.6                    68.4
## 2          2            60.2     79.2               58.0                    73.2
## 3          3            70.6     77.9               54.3                    74.7
## 4          4            62.8     77.4               67.4                    75.4

Illustration of using the pipe operator %>%:

# using pipes: each step feeds its result into the next function
# dend <- 1:10 %>% dist %>% hclust %>% as.dendrogram 
# dend %>% plot()

dend <- happiness %>%               # take the data
  scale() %>%                       # standardize (mean 0, sd 1)
  dist %>%                          # calculate a distance matrix
  hclust(method = "complete") %>%   # hierarchical clustering, complete linkage
  as.dendrogram                     # turn it into a dendrogram

dend %>% plot()